import pandas as pd
import numpy as np
import klib
import pandas_profiling as pp
import sweetviz
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import *
from sklearn.metrics import *
from sklearn import set_config
from sklearn.utils import estimator_html_repr
from IPython.core.display import display, HTML
set_config(display='diagram')
from platform import python_version
%precision 2
pd.set_option('max_columns',200)
pd.set_option('display.precision',2)
pd.set_option('display.float_format','{:,.2f}'.format)
print('python',python_version())
print(np.__name__, np.__version__)
print(pd.__name__, pd.__version__)
print(klib.__name__, klib.__version__)
#print(pp.__name__, pp.__version__)
print(sklearn.__name__, sklearn.__version__)
print(sweetviz.__name__, sweetviz.__version__)
python 3.9.2 numpy 1.20.1 pandas 1.1.4 klib 0.1.5 sklearn 0.24.1 sweetviz 2.0.9
%%html
<style>
/* Enlarge rendered markdown cells for readability. */
.jp-RenderedMarkdown {
font-size:130%;
}
/* Shrink the code-editor and terminal fonts so more code fits on screen. */
.CodeMirror, .CodeMirror pre, .CodeMirror-dialog, .CodeMirror-dialog .CodeMirror-search-field, .terminal-app .terminal {
font-size: 80%;
}
</style>
def plot_silhouettes(data, clusters, metric='euclidean'):
    """Plot a horizontal silhouette bar chart, one band per cluster.

    Each sample's silhouette coefficient is drawn as a bar, grouped and
    sorted within its cluster; a dashed red line marks the overall mean
    silhouette score.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix the clustering was computed on.
    clusters : array-like of shape (n_samples,)
        Cluster label assigned to each sample.
    metric : str, default 'euclidean'
        Distance metric passed to ``silhouette_samples``.
    """
    from matplotlib import cm
    from sklearn.metrics import silhouette_samples
    cluster_labels = np.unique(clusters)
    n_clusters = cluster_labels.shape[0]
    # Bug fix: call the locally imported silhouette_samples (the original
    # referenced an undefined `metrics` module) and honour the `metric`
    # argument instead of hard-coding 'euclidean'.
    silhouette_vals = silhouette_samples(data, clusters, metric=metric)
    c_ax_lower, c_ax_upper = 0, 0
    cticks = []
    for i, k in enumerate(cluster_labels):
        # Sorted silhouette values for cluster k, stacked above the
        # previous cluster's band on the y axis.
        c_silhouette_vals = silhouette_vals[clusters == k]
        c_silhouette_vals.sort()
        c_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        plt.barh(range(c_ax_lower, c_ax_upper), c_silhouette_vals, height=1.0,
                 edgecolor='none', color=color)
        # Tick in the middle of the band so the cluster label is centered.
        cticks.append((c_ax_lower + c_ax_upper) / 2)
        c_ax_lower += len(c_silhouette_vals)
    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="red", linestyle="--")
    plt.yticks(cticks, cluster_labels)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.tight_layout()
    plt.show()
    return
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted agglomerative-clustering model.

    The model must expose ``children_``, ``distances_`` and ``labels_``
    (as sklearn's AgglomerativeClustering does when fitted with
    ``compute_distances=True``). Extra keyword arguments are forwarded
    to ``scipy.cluster.hierarchy.dendrogram``.
    """
    from scipy.cluster.hierarchy import dendrogram

    n_samples = len(model.labels_)
    # Number of original samples sitting under each merged (internal) node:
    # a child index < n_samples is a leaf (counts as 1); otherwise it refers
    # to an earlier merge whose count we already computed.
    counts = np.zeros(model.children_.shape[0])
    for node_idx, (left, right) in enumerate(model.children_):
        total = 0
        for child in (left, right):
            total += 1 if child < n_samples else counts[child - n_samples]
        counts[node_idx] = total

    # Assemble the scipy-style linkage matrix: [child_a, child_b, dist, count].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
| Variable | Type | Description |
|---|---|---|
| Clientnum | Num | Client number. Unique identifier for the customer holding the account |
| Attrition_Flag | Char | Internal event (customer activity) variable - if the account is closed then 1 else 0 |
| Customer_Age | Num | Demographic variable - Customer's Age in Years |
| Gender | Char | Demographic variable - M=Male, F=Female |
| Dependent_count | Num | Demographic variable - Number of dependents |
| Education_Level | Char | Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.) |
| Marital_Status | Char | Demographic variable - Married, Single, Unknown |
| Income_Category | Char | Demographic variable - Annual Income Category of the account holder (< $40K, $40K - 60K, $60K - $80K, $80K-$120K, > $120K, Unknown) |
| Card_Category | Char | Product Variable - Type of Card (Blue, Silver, Gold, Platinum) |
| Months_on_book | Num | Months on book (Time of Relationship) |
| Total_Relationship_Count | Num | Total no. of products held by the customer |
| Months_Inactive_12_mon | Num | No. of months inactive in the last 12 months |
| Contacts_Count_12_mon | Num | No. of Contacts in the last 12 months |
| Credit_Limit | Num | Credit Limit on the Credit Card |
| Total_Revolving_Bal | Num | Total Revolving Balance on the Credit Card |
| Avg_Open_To_Buy | Num | Open to Buy Credit Line (Average of last 12 months) |
| Total_Amt_Chng_Q4_Q1 | Num | Change in Transaction Amount (Q4 over Q1) |
| Total_Trans_Amt | Num | Total Transaction Amount (Last 12 months) |
| Total_Trans_Ct | Num | Total Transaction Count (Last 12 months) |
| Total_Ct_Chng_Q4_Q1 | Num | Change in Transaction Count (Q4 over Q1) |
| Avg_Utilization_Ratio | Num | Average Card Utilization Ratio |
The train dataset will be used for analysis.
# Read [Train] data from CSV; the first column of the file is the row index.
df_train = pd.read_csv('./data/bank_churners_train.csv', index_col=0)

# Ordered categoricals — the level order matters for ordinal encoding later.
ordered_levels = {
    'card_category': ['Blue', 'Silver', 'Gold', 'Platinum'],
    'income_category': ['Unknown', 'Less than $40K', '$40K - $60K',
                        '$60K - $80K', '$80K - $120K', '$120K +'],
    'education_level': ['Unknown', 'Uneducated', 'High School', 'College',
                        'Graduate', 'Post-Graduate', 'Doctorate'],
}
for col, levels in ordered_levels.items():
    df_train[col] = pd.Categorical(df_train[col], levels, ordered=True)

# Unordered categoricals.
for col in ('gender', 'marital_status'):
    df_train[col] = pd.Categorical(df_train[col])

# Features are every column except the last; the last column is the target.
X = df_train.iloc[:, :-1]
y = df_train.attrited_customer
print('X and y shapes:')
print(X.shape, y.shape, '\n')
print(df_train.info())
y.value_counts(normalize=True, dropna=False).plot.bar(title='attrited_customer distribution');
X and y shapes: (8101, 19) (8101,) <class 'pandas.core.frame.DataFrame'> Int64Index: 8101 entries, 3066 to 1919 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customer_age 8101 non-null int64 1 dependent_count 8101 non-null int64 2 months_on_book 8101 non-null int64 3 total_relationship_count 8101 non-null int64 4 months_inactive_12_mon 8101 non-null int64 5 contacts_count_12_mon 8101 non-null int64 6 credit_limit 8101 non-null float64 7 total_revolving_bal 8101 non-null int64 8 avg_open_to_buy 8101 non-null float64 9 total_amt_chng_q4_q1 8101 non-null float64 10 total_trans_amt 8101 non-null int64 11 total_trans_ct 8101 non-null int64 12 total_ct_chng_q4_q1 8101 non-null float64 13 avg_utilization_ratio 8101 non-null float64 14 gender 8101 non-null category 15 education_level 8101 non-null category 16 marital_status 8101 non-null category 17 income_category 8101 non-null category 18 card_category 8101 non-null category 19 attrited_customer 8101 non-null bool dtypes: bool(1), category(5), float64(5), int64(9) memory usage: 997.8 KB None
y.value_counts(normalize=True, dropna=False)
False 0.84 True 0.16 Name: attrited_customer, dtype: float64
The target is imbalanced: only 16% of customers are attrited.
This means that accuracy scores for classification will be misleading, we will need to consider F1, Recall, and Precision scores.
Since the target variable flags attrited customers, we can be more tolerant of Type I errors than Type II errors: we would rather mis-classify a loyal customer as attrited and proactively approach them to retain them, than miss a truly attriting customer and lose them.
Accordingly, while seeking the highest F1 score to achieve a reasonable balance, we should be favoring better recalls.
# Summary statistics of the raw (unencoded) feature matrix.
X.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| customer_age | 8,101.00 | 46.30 | 7.98 | 26.00 | 41.00 | 46.00 | 52.00 | 73.00 |
| dependent_count | 8,101.00 | 2.35 | 1.30 | 0.00 | 1.00 | 2.00 | 3.00 | 5.00 |
| months_on_book | 8,101.00 | 35.93 | 7.95 | 13.00 | 31.00 | 36.00 | 40.00 | 56.00 |
| total_relationship_count | 8,101.00 | 3.81 | 1.56 | 1.00 | 3.00 | 4.00 | 5.00 | 6.00 |
| months_inactive_12_mon | 8,101.00 | 2.35 | 1.01 | 0.00 | 2.00 | 2.00 | 3.00 | 6.00 |
| contacts_count_12_mon | 8,101.00 | 2.45 | 1.11 | 0.00 | 2.00 | 2.00 | 3.00 | 6.00 |
| credit_limit | 8,101.00 | 8,634.91 | 9,056.86 | 1,438.30 | 2,568.00 | 4,589.00 | 11,127.00 | 34,516.00 |
| total_revolving_bal | 8,101.00 | 1,162.16 | 814.18 | 0.00 | 382.00 | 1,276.00 | 1,782.00 | 2,517.00 |
| avg_open_to_buy | 8,101.00 | 7,472.75 | 9,055.96 | 3.00 | 1,341.00 | 3,515.00 | 9,921.00 | 34,516.00 |
| total_amt_chng_q4_q1 | 8,101.00 | 0.76 | 0.22 | 0.00 | 0.63 | 0.74 | 0.86 | 3.40 |
| total_trans_amt | 8,101.00 | 4,421.09 | 3,431.04 | 510.00 | 2,162.00 | 3,895.00 | 4,746.00 | 18,484.00 |
| total_trans_ct | 8,101.00 | 64.94 | 23.52 | 10.00 | 45.00 | 67.00 | 81.00 | 139.00 |
| total_ct_chng_q4_q1 | 8,101.00 | 0.72 | 0.24 | 0.00 | 0.58 | 0.71 | 0.82 | 3.71 |
| avg_utilization_ratio | 8,101.00 | 0.27 | 0.27 | 0.00 | 0.02 | 0.18 | 0.50 | 1.00 |
# Column groupings for the preprocessing pipeline. The first 14 columns of
# df_train are the numeric features (see df_train.info() above).
features_numeric = list(df_train.columns[:14])
features_ordinal = ['education_level', 'income_category', 'card_category']
features_nominal = ['gender', 'marital_status']
# Names of the columns that OneHotEncoder produces for the nominal features,
# in the encoder's output order (gender first, then marital_status).
features_nominal_ohe = ['is_Female', 'is_Male', 'is_Divorced', 'is_Married',
                        'is_Single', 'is_Unknown_Marital']  # OneHotEncoded

features_names = df_train.columns[:19].values
# Column order of the transformed matrix mirrors the transformer order
# below: numeric, then ordinal, then one-hot nominal.
features_names_enc = features_numeric + features_ordinal + features_nominal_ohe

numeric_transformer = StandardScaler()
ordinal_transformer = OrdinalEncoder()
nominal_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features_numeric),
        ('ord', ordinal_transformer, features_ordinal),
        ('nom', nominal_transformer, features_nominal),
    ],
    remainder='passthrough',  # the three lists cover all 19 feature columns
    n_jobs=-1,
)

# Fit/transform and wrap back into a labelled DataFrame for inspection.
X_ = preprocessor.fit_transform(X)
X_enc = pd.DataFrame(X_, columns=features_names_enc, index=X.index)
X_enc.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| customer_age | 8,101.00 | 0.00 | 1.00 | -2.55 | -0.66 | -0.04 | 0.71 | 3.35 |
| dependent_count | 8,101.00 | 0.00 | 1.00 | -1.82 | -1.04 | -0.27 | 0.50 | 2.04 |
| months_on_book | 8,101.00 | -0.00 | 1.00 | -2.89 | -0.62 | 0.01 | 0.51 | 2.53 |
| total_relationship_count | 8,101.00 | 0.00 | 1.00 | -1.80 | -0.52 | 0.12 | 0.76 | 1.40 |
| months_inactive_12_mon | 8,101.00 | 0.00 | 1.00 | -2.32 | -0.35 | -0.35 | 0.64 | 3.61 |
| contacts_count_12_mon | 8,101.00 | 0.00 | 1.00 | -2.21 | -0.41 | -0.41 | 0.50 | 3.21 |
| credit_limit | 8,101.00 | 0.00 | 1.00 | -0.79 | -0.67 | -0.45 | 0.28 | 2.86 |
| total_revolving_bal | 8,101.00 | 0.00 | 1.00 | -1.43 | -0.96 | 0.14 | 0.76 | 1.66 |
| avg_open_to_buy | 8,101.00 | 0.00 | 1.00 | -0.82 | -0.68 | -0.44 | 0.27 | 2.99 |
| total_amt_chng_q4_q1 | 8,101.00 | -0.00 | 1.00 | -3.43 | -0.59 | -0.11 | 0.45 | 11.89 |
| total_trans_amt | 8,101.00 | -0.00 | 1.00 | -1.14 | -0.66 | -0.15 | 0.09 | 4.10 |
| total_trans_ct | 8,101.00 | -0.00 | 1.00 | -2.34 | -0.85 | 0.09 | 0.68 | 3.15 |
| total_ct_chng_q4_q1 | 8,101.00 | 0.00 | 1.00 | -2.94 | -0.54 | -0.04 | 0.44 | 12.29 |
| avg_utilization_ratio | 8,101.00 | 0.00 | 1.00 | -1.00 | -0.91 | -0.35 | 0.82 | 2.65 |
| education_level | 8,101.00 | 3.09 | 1.84 | 0.00 | 2.00 | 3.00 | 5.00 | 6.00 |
| income_category | 8,101.00 | 2.87 | 1.51 | 0.00 | 2.00 | 3.00 | 4.00 | 5.00 |
| card_category | 8,101.00 | 0.17 | 0.68 | 0.00 | 0.00 | 0.00 | 0.00 | 3.00 |
| is_Female | 8,101.00 | 0.53 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 |
| is_Male | 8,101.00 | 0.47 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| is_Divorced | 8,101.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| is_Married | 8,101.00 | 0.46 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| is_Single | 8,101.00 | 0.39 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
| is_Unknown_Marital | 8,101.00 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
Let's check the correlations, showing only the highly correlated pairs with |r| greater than 0.6.
corr = X_enc.join(y).corr()  # Join with y to be included in correlations
fig, ax = plt.subplots(figsize=(12, 12))
# Upper triangle is a redundant mirror image — hide it.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Additionally hide every cell whose correlation is weak (|r| <= .6).
weak_or_upper = (corr.abs() <= .6) | mask
hm = sns.heatmap(corr, annot=True, center=0, linecolor='lightgray',
                 linewidths=.01, ax=ax, cmap="coolwarm", fmt='.2g',
                 square=True, cbar=False, mask=weak_or_upper)
plt.title('Correlation Heatmap (r>abs(.6))', fontsize=24);
Let's check the pairplot for linear relationships or distinctive distributions (double-click the pairplot to read details).
# Pairwise scatter/KDE of the numeric and ordinal features, colored by target.
sns.pairplot(X_enc.join(y), vars=features_numeric+features_ordinal, hue='attrited_customer', corner=True);